library(tidyverse)
library(rnoaa) # Weather data
library(imputeTS) # Imputation on the temperature
library(naniar) # Visulize missing data
library(cowplot)
library(lubridate)
library(mgcv) # Spline fitting
library(randomForest)
library(e1071) # SVM
library(ggsci) # Color
library(corrplot) # Correlation plot
library(ssr) # Semi-supervised regression
source("helper_functions.R")
# Blossom data
cherry <- read.csv("data/washingtondc.csv") %>%
bind_rows(read.csv("data/liestal.csv")) %>%
bind_rows(read.csv("data/kyoto.csv"))
# Get from the website
# https://www.nps.gov/subjects/cherryblossom/bloom-watch.htm
cherry_vc <- c("03/31/2004", "04/09/2005", "03/30/2006", "04/01/2007", "03/26/2008", "04/01/2009", "03/31/2010", "03/29/2011", "03/20/2012", "04/09/2013", "04/10/2014", "04/10/2015", "03/25/2016", "03/25/2017", "04/05/2018", "04/01/2019", "03/20/2020", "03/28/2021")
cherry_vc <- data.frame(bloom_date = cherry_vc) %>%
mutate(bloom_date = as.POSIXct(bloom_date, format = '%m/%d/%Y')) %>%
mutate(year = as.integer(format(bloom_date, "%Y"))) %>%
mutate(bloom_doy = yday(bloom_date)) %>%
mutate(location = "vancouver") %>%
mutate(bloom_date = as.character(bloom_date)) %>%
select(location, year, bloom_date, bloom_doy)
cherry <- cherry %>% select(location, year, bloom_date, bloom_doy) %>%
bind_rows(cherry_vc)
cherry %>%
filter(year >= 1950) %>%
ggplot(aes(x = year, y = bloom_doy)) +
geom_point() +
geom_step(linetype = 'dotted', color = 'gray50') +
scale_x_continuous(breaks = seq(1880, 2020, by = 20)) +
facet_grid(cols = vars(str_to_title(location))) +
labs(x = "Year", y = "Peak bloom (days since Jan 1st)")
Time series of peak bloom of cherry trees since 1950 at four different sites.
p1 <- cherry %>%
ggplot(aes(x = bloom_doy, fill = location), alpha = 0.7) +
geom_bar() + labs(x = "Peak bloom (days since Jan 1st)") +
scale_fill_npg() + theme_bw() +
labs(x = "Peak bloom (days since Jan 1st)", y = "Frequency")
p2 <- cherry %>%
ggplot() +
geom_density(aes(x = bloom_doy, fill = location), alpha = 0.7) +
scale_fill_npg() +
theme_bw() +
labs(x = "Peak bloom (days since Jan 1st)", y = "Density")
ggpubr::ggarrange(p1, p2, common.legend = TRUE)
Distribution of peak bloom days across the four sites.
# Imputation plots
# Get weather data.
dailyweather <- get_ghcnd("JA000047759")
# Impute.
dailyweather <- imp_temperature(dat = dailyweather, c("tmax", "tmin"))
dailyweather %>%
filter(year >= "1994-12-01" & year <= "1995-12-01") %>%
ggplot(aes(date, tmin_imp, color = tmin_na)) +
geom_point() + theme_bw() +
labs(x = "Date", y = "Temperature") +
scale_color_discrete(name = "", labels = c("Nonmissing", "Imputed"))
# Weather data
cherry.weathers <-
tibble(location = "washingtondc", get_seasons.temperature("USC00186350")) %>%
bind_rows(tibble(location = "liestal", get_seasons.temperature("GME00127786"))) %>%
bind_rows(tibble(location = "kyoto", get_seasons.temperature("JA000047759"))) %>%
bind_rows(tibble(location = "vancouver", get_seasons.temperature("CA001108395")))
model_data <- cherry.weathers %>%
merge(cherry, by = c("year", "location"), all.x = TRUE, all.y = TRUE) %>%
filter(year >= 1950)
# Correlation plots
par(mfrow=c(2,2))
# Japan
corr_data_jp <- model_data %>%
filter(location == "kyoto") %>%
select(where(is.numeric)) %>%
drop_na()
colnames(corr_data_jp) <- c("Year", "AGDD", "FGDOY", "LGDOY",
"AFDD", "FFDOY", "LFDOY",
"Tmax(W)", "Tmax(S)", "Tmin(W)", "Tmin(S)",
"PRCP(W)", "PRCP(S)", "Bloom days")
corr_data_jp <- cor(corr_data_jp)
p1 <- corrplot(corr_data_jp, tl.col = 'black')
# Li
corr_data_li <- model_data %>%
filter(location == "liestal") %>%
select(where(is.numeric)) %>%
drop_na()
colnames(corr_data_li) <- c("Year", "AGDD", "FGDOY", "LGDOY",
"AFDD", "FFDOY", "LFDOY",
"Tmax(W)", "Tmax(S)", "Tmin(W)", "Tmin(S)",
"PRCP(W)", "PRCP(S)", "Bloom days")
corr_data_li <- cor(corr_data_li)
p2 <- corrplot(corr_data_li, tl.col = 'black')
# dc
corr_data_dc <- model_data %>%
filter(location == "washingtondc") %>%
select(where(is.numeric)) %>%
drop_na()
colnames(corr_data_dc) <- c("Year", "AGDD", "FGDOY", "LGDOY",
"AFDD", "FFDOY", "LFDOY",
"Tmax(W)", "Tmax(S)", "Tmin(W)", "Tmin(S)",
"PRCP(W)", "PRCP(S)", "Bloom days")
corr_data_dc <- cor(corr_data_dc)
p3 <- corrplot(corr_data_dc, tl.col = 'black')
# vanc
corr_data_vc <- model_data %>%
filter(location == "vancouver") %>%
select(where(is.numeric)) %>%
drop_na()
colnames(corr_data_vc) <- c("Year", "AGDD", "FGDOY", "LGDOY",
"AFDD", "FFDOY", "LFDOY",
"Tmax(W)", "Tmax(S)", "Tmin(W)", "Tmin(S)",
"PRCP(W)", "PRCP(S)", "Bloom days")
corr_data_vc <- cor(corr_data_vc)
p4 <- corrplot(corr_data_vc, tl.col = 'black')
Additional plots for weather
# 1. Montly average temperature
dailyweather <-
tibble(location = "washingtondc", get_ghcnd("USC00186350")) %>%
bind_rows(tibble(location = "liestal", get_ghcnd("GME00127786"))) %>%
bind_rows(tibble(location = "kyoto", get_ghcnd("JA000047759"))) %>%
bind_rows(tibble(location = "vancouver", get_ghcnd("CA001108395")))
dailyweather <- imp_temperature(dat = dailyweather, c("tmax", "tmin"))
month_tmax <- dailyweather %>%
group_by(month_name, location, year) %>%
summarize(tmax_avg = mean(tmax_imp),
tmin_avg = mean(tmin_imp),
prcp_sum = sum(prcp, na.rm = TRUE),
prcp_avg = mean(prcp, na.rm = TRUE))
a <- ggplot(month_tmax, aes(year, tmax_avg, color = location)) +
geom_point(size = 0.5) +
geom_smooth(method = "loess") +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
axis.title.x = element_blank(),
legend.position = "top") +
ylim(c(-10, 30)) +
scale_color_npg() +
theme_bw() +
labs(title = "Monthly mean maximum temperature",
subtitle = "January 1950 - Feburary 2022", y = "Degrees Celsius") +
facet_wrap(~ month_name) +
NULL
b <- ggplot(month_tmax, aes(year, tmin_avg, color = location)) +
geom_point(size = 0.5) +
geom_smooth(method = "loess") +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
axis.title.x = element_blank(),
legend.position = "top") +
ylim(c(-10, 30)) +
scale_color_npg() +
theme_bw() +
labs(title = "Monthly mean minimum temperature",
subtitle = "January 1950 - Feburary 2022", y = "Degrees Celsius") +
facet_wrap(~ month_name) +
NULL
ggpubr::ggarrange(a, b, common.legend = TRUE)
annual_tamx <- month_tmax %>%
group_by(year, location) %>%
summarise(tmax_avg = mean(tmax_avg),
tmin_avg = mean(tmin_avg))
a <- annual_tamx %>%
ggplot(aes(year, tmax_avg)) +
geom_point() +
geom_line() +
geom_smooth(method = "loess") +
theme(axis.title.x = element_blank(),
legend.position = "none") +
labs(title = "Annual mean maximum temperature", subtitle = ": 1980-2021", y = "Degrees Celsius")
b <- annual_tamx %>%
ggplot(aes(year, tmin_avg)) +
geom_point() +
geom_line() +
geom_smooth(method = "loess") +
theme(axis.title.x = element_blank(),
axis.title.y = element_blank(),
legend.position = "none") +
labs(title = "Annual mean minimum temperature", subtitle = ": 1980-2021", y = "Degrees Celsius")
plot_grid(a, b, nrow = 1, rel_widths = c(1, 1))
# See how many data are available for training
model_data %>%
group_by(location) %>%
summarise(n = n())
## # A tibble: 4 × 2
## location n
## <chr> <int>
## 1 kyoto 73
## 2 liestal 73
## 3 vancouver 48
## 4 washingtondc 72
model_data_jp <- model_data %>%
filter(location == "kyoto") %>% drop_na()
mod_spline1 <- gam(bloom_doy ~ s(afdd) + s(tmax_avg_Winter) + s(tmax_avg_Spring) +
s(prcp_avg_Winter) + s(tmin_avg_Spring) + s(prcp_avg_Spring), data = model_data_jp)
layout(matrix(1:6, nrow = 3))
plot(mod_spline1, shade = TRUE)
summary(mod_spline1)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## bloom_doy ~ s(afdd) + s(tmax_avg_Winter) + s(tmax_avg_Spring) +
## s(prcp_avg_Winter) + s(tmin_avg_Spring) + s(prcp_avg_Spring)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 97.6143 0.3271 298.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(afdd) 1.000 1.000 0.779 0.381213
## s(tmax_avg_Winter) 5.630 6.754 2.645 0.025000 *
## s(tmax_avg_Spring) 1.000 1.000 15.081 0.000271 ***
## s(prcp_avg_Winter) 1.164 1.303 0.030 0.909195
## s(tmin_avg_Spring) 1.675 2.114 1.114 0.325860
## s(prcp_avg_Spring) 1.283 1.507 1.820 0.127277
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.643 Deviance explained = 70.4%
## GCV = 9.1605 Scale est. = 7.4917 n = 70
# Model 3 has the highest r.sq
mod_spline2 <- gam(bloom_doy ~ s(year, afdd) + s(tmax_avg_Winter) + s(tmin_avg_Spring) + s(tmax_avg_Spring), data = model_data_jp)
mod_spline3 <- gam(bloom_doy ~ s(afdd, prcp_avg_Spring) + s(tmax_avg_Winter) +
s(tmin_avg_Spring, tmax_avg_Spring), data = model_data_jp)
summary(mod_spline2)$r.sq
## [1] 0.7634425
summary(mod_spline3)$r.sq
## [1] 0.6580035
layout(matrix(1:4, nrow = 1))
plot(mod_spline2, shade = TRUE)
summary(mod_spline2)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## bloom_doy ~ s(year, afdd) + s(tmax_avg_Winter) + s(tmin_avg_Spring) +
## s(tmax_avg_Spring)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 97.6143 0.2662 366.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(year,afdd) 19.062 23.457 2.093 0.02298 *
## s(tmax_avg_Winter) 2.463 3.009 1.825 0.16087
## s(tmin_avg_Spring) 1.000 1.000 8.120 0.00682 **
## s(tmax_avg_Spring) 5.855 6.905 2.372 0.04314 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.763 Deviance explained = 86.1%
## GCV = 8.5464 Scale est. = 4.9594 n = 70
gam.check(mod_spline2)
train_data <- model_data_jp[1:60, ]
test_data <- model_data_jp[61:70, ]
# mod_spline1 <- gam(bloom_doy ~ s(afdd) + s(tmax_avg_Winter) + s(tmax_avg_Spring) + s(prcp_avg_Winter) + s(tmin_avg_Spring) + s(prcp_avg_Spring), data = train_data)
# unlist(cal_pred(mod_spline1, train_data, test_data))[-c(1:70)]
#
# mod_spline2 <- gam(bloom_doy ~ s(year) + s(tmax_avg_Winter) +
# s(tmin_avg_Spring) + s(tmax_avg_Spring), data = train_data)
# unlist(cal_pred(mod_spline2, train_data, test_data))[-c(1:70)]
#
# mod_spline3 <- gam(bloom_doy ~ s(prcp_avg_Spring) + s(tmax_avg_Winter) +
# s(tmin_avg_Spring) + s(tmax_avg_Spring), data = train_data)
# unlist(cal_pred(mod_spline3, train_data, test_data))[-c(1:70)]
set.seed(1234)
# Spline
mod_spline_jp <- gam(bloom_doy ~ s(tmax_avg_Winter) + s(tmax_avg_Spring) +
s(prcp_avg_Winter) + s(prcp_avg_Spring), data = train_data)
summary(mod_spline_jp)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## bloom_doy ~ s(tmax_avg_Winter) + s(tmax_avg_Spring) + s(prcp_avg_Winter) +
## s(prcp_avg_Spring)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 98.2833 0.3436 286 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(tmax_avg_Winter) 5.242 6.356 2.898 0.0138 *
## s(tmax_avg_Spring) 1.853 2.319 12.351 2.3e-05 ***
## s(prcp_avg_Winter) 1.222 1.397 0.501 0.4382
## s(prcp_avg_Spring) 1.000 1.000 2.829 0.0988 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.61 Deviance explained = 67.2%
## GCV = 8.5551 Scale est. = 7.084 n = 60
layout(matrix(1:4, nrow = 2))
plot(mod_spline_jp, shade = TRUE)
unlist(cal_pred(mod_spline_jp, train_data, test_data))[-c(1:70)]
## train_mae test_mae train_mse test_mse R_sq
## 1.8833333 3.1000000 5.8500000 14.3000000 0.6717402
# random forest
rf_jp <- randomForest(bloom_doy ~ afdd + tmax_avg_Winter +
tmax_avg_Spring + tmin_avg_Spring +
prcp_avg_Spring, data = train_data,
mtry = 4/3, importance = TRUE, ntrees = 500)
unlist(cal_pred(rf_jp, train_data, test_data))[-c(1:70)]
## train_mae test_mae train_mse test_mse R_sq
## 2.5666667 3.7000000 10.3666667 21.5000000 0.4385371
# svm
svm_jp <- svm(bloom_doy ~ afdd + tmax_avg_Winter +
tmax_avg_Spring + tmin_avg_Spring +
prcp_avg_Spring, data = train_data)
unlist(cal_pred(svm_jp, train_data, test_data))[-c(1:70)]
## train_mae test_mae train_mse test_mse R_sq
## 1.7333333 3.8000000 7.1666667 22.2000000 0.5994587
# Local linear
Three_Cities_kernel_fun(loc = "kyoto", model_data_jp, bw = 0.1)
## $location
## [1] "kyoto"
##
## $R_square
## [1] 0.8479738
##
## $train.MAE
## [1] 1.083333
##
## $test.MAE
## [1] 4.3
##
## $`test fit`
## [1] 100 101 95 90 89 93 93 90 90 92
model_data_li <- model_data %>%
filter(location == "liestal") %>% drop_na()
train_data <- model_data_li[1:55, ]
test_data <- model_data_li[56:66, ]
# Spline
mod_spline_vc <- gam(bloom_doy ~ s(tmax_avg_Winter) + s(tmax_avg_Spring) +
s(prcp_avg_Winter) + s(prcp_avg_Spring), data = train_data)
summary(mod_spline_vc)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## bloom_doy ~ s(tmax_avg_Winter) + s(tmax_avg_Spring) + s(prcp_avg_Winter) +
## s(prcp_avg_Spring)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99.8727 0.8873 112.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(tmax_avg_Winter) 1.000 1.000 17.722 0.000141 ***
## s(tmax_avg_Spring) 3.320 4.103 5.963 0.000688 ***
## s(prcp_avg_Winter) 5.933 6.922 2.606 0.026108 *
## s(prcp_avg_Spring) 3.891 4.696 1.627 0.170819
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.694 Deviance explained = 77.4%
## GCV = 59.754 Scale est. = 43.301 n = 55
unlist(cal_pred(mod_spline_vc, train_data, test_data))[-c(1:66)]
## train_mae test_mae train_mse test_mse R_sq
## 4.5454545 6.6363636 31.6727273 70.6363636 0.7742906
# random forest
rf_vc <- randomForest(bloom_doy ~ tmax_avg_Winter +
tmax_avg_Spring + prcp_avg_Winter +
prcp_avg_Spring, data = train_data,
mtry = 4/3, importance = TRUE, ntrees = 500)
unlist(cal_pred(rf_vc, train_data, test_data))[-c(1:66)]
## train_mae test_mae train_mse test_mse R_sq
## 6.5818182 6.1818182 68.4727273 58.7272727 0.5131773
# svm
svm_vc <- svm(bloom_doy ~ tmax_avg_Winter +
tmax_avg_Spring + prcp_avg_Winter +
prcp_avg_Spring, data = train_data)
unlist(cal_pred(svm_vc, train_data, test_data))[-c(1:66)]
## train_mae test_mae train_mse test_mse R_sq
## 4.0000000 6.3636364 35.4909091 59.4545455 0.7464927
# Local linear
Three_Cities_kernel_fun(loc = "liestal", model_data_li, bw = 0.1)
## $location
## [1] "liestal"
##
## $R_square
## [1] 0.8792853
##
## $train.MAE
## [1] 2.839286
##
## $test.MAE
## [1] 8.4
##
## $`test fit`
## [1] 90 75 78 96 74 87 84 84 75 82
model_data_dc <- model_data %>%
filter(location == "washingtondc") %>% drop_na()
train_data <- model_data_dc[1:60, ]
test_data <- model_data_dc[61:70, ]
# Spline
mod_spline_dc <- gam(bloom_doy ~ s(year) + s(tmax_avg_Winter) + s(tmax_avg_Spring) + s(tmin_avg_Winter) , data = train_data)
summary(mod_spline_dc)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## bloom_doy ~ s(year) + s(tmax_avg_Winter) + s(tmax_avg_Spring) +
## s(tmin_avg_Winter)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 94.050 0.675 139.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(year) 1 1 25.651 5.27e-06 ***
## s(tmax_avg_Winter) 1 1 5.294 0.0252 *
## s(tmax_avg_Spring) 1 1 6.962 0.0108 *
## s(tmin_avg_Winter) 1 1 5.029 0.0290 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.351 Deviance explained = 39.5%
## GCV = 29.82 Scale est. = 27.335 n = 60
unlist(cal_pred(mod_spline_dc, train_data, test_data))[-c(1:70)]
## train_mae test_mae train_mse test_mse R_sq
## 3.683333 5.700000 24.983333 47.300000 0.395440
# random forest
rf_dc <- randomForest(bloom_doy ~ tmax_avg_Winter +
tmax_avg_Spring + prcp_avg_Winter +
prcp_avg_Spring, data = train_data,
mtry = 4/3, importance = TRUE, ntrees = 500)
unlist(cal_pred(rf_dc, train_data, test_data))[-c(1:70)]
## train_mae test_mae train_mse test_mse R_sq
## 5.5666667 7.7000000 48.5666667 81.7000000 -0.1585278
# svm
svm_dc <- svm(bloom_doy ~ tmax_avg_Winter +
tmax_avg_Spring + prcp_avg_Winter +
prcp_avg_Spring, data = train_data)
unlist(cal_pred(svm_dc, train_data, test_data))[-c(1:70)]
## train_mae test_mae train_mse test_mse R_sq
## 3.7500000 6.7000000 29.0166667 72.5000000 0.3112832
# Local linear
Three_Cities_kernel_fun(loc = "washingtondc", model_data_dc, bw = 0.1)
## $location
## [1] "washingtondc"
##
## $R_square
## [1] 0.72053
##
## $train.MAE
## [1] 2.25
##
## $test.MAE
## [1] 4.6
##
## $`test fit`
## [1] 81 91 93 89 86 87 91 88 87 88
date_van <- c("03/31/2004", "04/09/2005", "03/30/2006", "04/01/2007", "03/26/2008", "04/01/2009", "03/31/2010", "03/29/2011", "03/20/2012", "04/09/2013", "04/10/2014", "04/10/2015", "03/25/2016", "03/25/2017", "04/05/2018", "04/01/2019", "03/20/2020", "03/28/2021")
date_van <- data.frame(date = date_van) %>%
mutate(date = as.POSIXct(date, format = '%m/%d/%Y')) %>%
mutate(year = as.integer(format(date, "%Y"))) %>%
mutate(bloom_doy = yday(date)) %>%
mutate(location = "vancouver")
model_data_vc <- model_data %>%
filter(location == "vancouver") %>%
select(-c(bloom_date, bloom_doy)) %>%
drop_na()
model_data_vc <- model_data_vc %>%
merge(date_van, by = "year", all.x = TRUE)
(Semi-supervised learning)[https://cran.r-project.org/web/packages/ssr/vignettes/ssr-package-vignette.html]
train_data <- model_data_vc[1:40, ]
train_data <- train_data %>%
select(bloom_doy, tmax_avg_Spring, tmin_avg_Spring)
test_data <- model_data_vc[41:45, ]
test_data <- test_data %>%
select(bloom_doy, tmax_avg_Spring, tmin_avg_Spring)
label_index <- which(is.na(train_data$bloom_doy) == 1)
L <- train_data[-label_index, ] # This is the labeled dataset.
U <- train_data[label_index, -1] # Remove the labels since this is the unlabeled dataset.
# Fit the model.
regressors <- list(linearRegression=lm, knn=caret::knnreg, svm=e1071::svm)
mod_ssl <- ssr("bloom_doy ~ .", L, U, regressors = regressors, testdata = test_data)
## [1] "Initial RMSE on testdata: 7.5424"
## [1] "Iteration 1 (testdata) RMSE: 6.3790 Improvement: 15.42%"
## [1] "Iteration 2 (testdata) RMSE: 6.1534 Improvement: 18.41%"
## [1] "Iteration 3 (testdata) RMSE: 6.0010 Improvement: 20.44%"
## [1] "Iteration 4 (testdata) RMSE: 5.9541 Improvement: 21.06%"
## [1] "Iteration 5 (testdata) RMSE: 5.8784 Improvement: 22.06%"
## [1] "Iteration 6 (testdata) RMSE: 5.8029 Improvement: 23.06%"
## [1] "Iteration 7 (testdata) RMSE: 5.7877 Improvement: 23.26%"
## [1] "Iteration 8 (testdata) RMSE: 5.7751 Improvement: 23.43%"
## [1] "Iteration 9 (testdata) RMSE: 5.7560 Improvement: 23.68%"
## [1] "Iteration 10 (testdata) RMSE: 5.7476 Improvement: 23.80%"
## [1] "Iteration 11 (testdata) RMSE: 5.7434 Improvement: 23.85%"
## [1] "Iteration 12 (testdata) RMSE: 5.7400 Improvement: 23.90%"
## [1] "Iteration 13 (testdata) RMSE: 5.7367 Improvement: 23.94%"
## [1] "Iteration 14 (testdata) RMSE: 5.7351 Improvement: 23.96%"
## [1] "Iteration 15 (testdata) RMSE: 5.7341 Improvement: 23.97%"
## [1] "Iteration 16 (testdata) RMSE: 5.7341 Improvement: 23.97%"
## [1] "Iteration 17 (testdata) RMSE: 5.7341 Improvement: 23.97%"
## [1] "Iteration 18 (testdata) RMSE: 5.7341 Improvement: 23.97%"
## [1] "Iteration 19 (testdata) RMSE: 5.7341 Improvement: 23.97%"
## [1] "Iteration 20 (testdata) RMSE: 5.7341 Improvement: 23.97%"
regressors <- list("lm", "rvmLinear")
plot(mod_ssl, metric = "mae", ptype = 2)
y_hat_test <- predict(mod_ssl, newdata = test_data)
test_pred <- round(y_hat_test)
test_pred
## [1] 93 95 93 89 89
test_mae <- mean(abs(test_data$bloom_doy - test_pred))
test_mae
## [1] 4.4
# Spline
# mod_spline_vc <- gam(bloom_doy ~ s(tmax_avg_Spring) , data = L)
# nonp <- round(predict(mod_spline_vc, newdata = test_data))
# mean(abs(test_data$bloom_doy - nonp))
#
# # random forest
# rf_vc <- randomForest(bloom_doy ~ tmax_avg_Spring, data = L,
# mtry = 4/3, importance = TRUE, ntrees = 500)
# nonp <- round(predict(rf_vc, newdata = test_data))
# mean(abs(test_data$bloom_doy - nonp))
#
# # svm
# svm_vc <- svm(bloom_doy ~ tmax_avg_Spring, data = L)
# nonp <- round(predict(svm_vc, newdata = test_data))
# mean(abs(test_data$bloom_doy - nonp))
source("helper_functions.R")
# dailyweather <- get_ghcnd(station_id = "JA000047759",
# date_range = c("1950-01-01", "2022-02-28"))
# dailyweather <- imp_temperature(dailyweather, c("tmax", "tmin"))
# forecast_jp <- forecast_weather(dailyweather)
forecast_jp <- forecast_weather_new(model_data_jp)
mod_spline_jp <- gam(bloom_doy ~ s(tmax_avg_Spring), data = model_data_jp)
# rf <- randomForest(bloom_doy ~ afdd + tmax_avg_Winter +
# tmax_avg_Spring + tmin_avg_Spring +
# prcp_avg_Spring, data = model_data_jp,
# mtry = 4/3, importance = TRUE, ntrees = 500)
# svmfit <- svm(bloom_doy ~ afdd + tmax_avg_Winter +
# tmax_avg_Spring + tmin_avg_Spring +
# prcp_avg_Spring, data = model_data_jp)
jp_pred <- round(predict(mod_spline_jp, newdata = forecast_jp))
# round(predict(rf, newdata = forecast_jp))
# round(predict(svmfit, newdata = forecast_jp))
# dailyweather <- get_ghcnd(station_id = "GME00127786",
# date_range = c("1950-01-01", "2022-02-28"))
# dailyweather <- imp_temperature(dailyweather, c("tmax", "tmin"))
# forecast_li <- forecast_weather(dailyweather)
forecast_li <- forecast_weather_new(model_data_li)
mod_spline_li <- gam(bloom_doy ~ s(tmax_avg_Spring), data = model_data_li)
#
# rf <- randomForest(bloom_doy ~ afdd + tmax_avg_Winter +
# tmax_avg_Spring + tmin_avg_Spring +
# prcp_avg_Spring, data = model_data_li,
# mtry = 4/3, importance = TRUE, ntrees = 500)
# svmfit <- svm(bloom_doy ~ afdd + tmax_avg_Winter +
# tmax_avg_Spring + tmin_avg_Spring +
# prcp_avg_Spring, data = model_data_li)
li_pred <- round(predict(mod_spline_li, newdata = forecast_li))
# round(predict(rf, newdata = forecast_li))
# round(predict(svmfit, newdata = forecast_li))
# dailyweather <- get_ghcnd(station_id = "USC00186350",
# date_range = c("1950-01-01", "2022-02-28"))
# dailyweather <- imp_temperature(dailyweather, c("tmax", "tmin"))
# forecast_dc <- forecast_weather(dailyweather)
forecast_dc <- forecast_weather_new(model_data_dc)
mod_spline_dc <- gam(bloom_doy ~s(tmax_avg_Spring), data = model_data_dc)
# rf <- randomForest(bloom_doy ~ afdd + tmax_avg_Winter +
# tmax_avg_Spring + tmin_avg_Spring +
# prcp_avg_Spring, data = model_data_dc,
# mtry = 4/3, importance = TRUE, ntrees = 500)
# svmfit <- svm(bloom_doy ~ afdd + tmax_avg_Winter +
# tmax_avg_Spring + tmin_avg_Spring +
# prcp_avg_Spring, data = model_data_dc)
dc_pred <- round(predict(mod_spline_dc, newdata = forecast_dc))
# round(predict(rf, newdata = forecast_li))
# round(predict(svmfit, newdata = forecast_li))
# dailyweather <- get_ghcnd(station_id = "CA001108395",
# date_range = c("1950-01-01", "2022-02-28"))
# dailyweather <- imp_temperature(dailyweather, c("tmax", "tmin"))
# forecast_vc <- forecast_weather(dailyweather)
forecast_vc <- forecast_weather_new(model_data_vc)
model_data_vc_ssl <- model_data_vc %>%
select(bloom_doy, tmax_avg_Winter, tmin_avg_Spring)
label_index <- which(is.na(model_data_vc_ssl$bloom_doy) == 1)
newL <- model_data_vc_ssl[-label_index, ]
newU <- model_data_vc_ssl[label_index, -1]
# Fit the model.
regressors <- list(linearRegression = lm, knn=caret::knnreg, svm=e1071::svm)
forecast_vc_ssl <- forecast_vc %>% select(tmax_avg_Spring)
mod_ssl <- ssr("bloom_doy ~ .", newL, newU, regressors = regressors)
## [1] "Iteration 1"
## [1] "Iteration 2"
## [1] "Iteration 3"
## [1] "Iteration 4"
## [1] "Iteration 5"
## [1] "Iteration 6"
## [1] "Iteration 7"
## [1] "Iteration 8"
## [1] "Iteration 9"
## [1] "Iteration 10"
## [1] "Iteration 11"
## [1] "Iteration 12"
## [1] "Iteration 13"
## [1] "Iteration 14"
## [1] "Iteration 15"
## [1] "Iteration 16"
## [1] "Iteration 17"
## [1] "Iteration 18"
## [1] "Iteration 19"
## [1] "Iteration 20"
# rf <- randomForest(bloom_doy ~ tmax_avg_Spring, data = newL,
# mtry = 4/3, importance = TRUE, ntrees = 500)
# svmfit <- svm(bloom_doy ~ tmax_avg_Spring, data = newL,)
vc_pred <- round(predict(mod_ssl, newdata = forecast_vc))
# round(predict(rf, newdata = forecast_vc))
# round(predict(svmfit, newdata = forecast_vc))
pred_all <- cbind(jp_pred, li_pred, dc_pred, vc_pred)
pred_all <- data.frame(year = c(2022:2031), pred_all)
colnames(pred_all) <- c("year", "kyoto", "liestal", "washingtondc", "vancouver")
write.csv(pred_all, file = "cherry-predictions.csv", row.names = FALSE)
forecast_weather_all <-
tibble(location = "washingtondc", forecast_dc, bloom_doy = dc_pred) %>%
bind_rows(tibble(location = "liestal", forecast_li, bloom_doy = li_pred)) %>%
bind_rows(tibble(location = "kyoto", forecast_jp, bloom_doy = jp_pred)) %>%
bind_rows(tibble(location = "vancouver", forecast_vc, bloom_doy = vc_pred))
p1 <- model_data %>%
select(location, year, tmax_avg_Winter, tmax_avg_Spring,
tmin_avg_Winter, tmin_avg_Spring, prcp_avg_Winter,
prcp_avg_Spring, afdd, bloom_doy) %>%
bind_rows(forecast_weather_all) %>%
ggplot(aes(x = year, y = tmax_avg_Spring, color = location)) +
geom_point() +
geom_step(linetype = 'dotted', color = 'gray50') +
geom_vline(xintercept = 2022) +
scale_color_npg() +
xlim(c(1980, 2031)) +
theme_bw() +
theme(legend.position = "top") +
facet_grid(rows = vars(location)) +
labs(x = "Year", y = "Average maximum temperature in Spring (°C)")
p2 <- model_data %>%
select(location, year, tmax_avg_Winter, tmax_avg_Spring,
tmin_avg_Winter, tmin_avg_Spring, prcp_avg_Winter,
prcp_avg_Spring, afdd, bloom_doy) %>%
bind_rows(forecast_weather_all) %>%
ggplot(aes(x = year, y = afdd, color = location)) +
geom_point() +
geom_step(linetype = 'dotted', color = 'gray50') +
geom_vline(xintercept = 2022) +
scale_color_npg() +
xlim(c(1980, 2031)) +
theme_bw() + theme(legend.position = "top") +
facet_grid(rows = vars(location)) +
labs(x = "Year", y = "Accumulated freezing degree days")
ggpubr::ggarrange(p1, p2, common.legend = T)
p1 <- model_data %>%
select(location, year, tmax_avg_Winter, tmax_avg_Spring,
tmin_avg_Winter, tmin_avg_Spring, prcp_avg_Winter,
prcp_avg_Spring, afdd, bloom_doy) %>%
bind_rows(forecast_weather_all) %>%
ggplot(aes(x = year, y = tmax_avg_Winter, color = location)) +
geom_point() +
geom_step(linetype = 'dotted', color = 'gray50') +
geom_vline(xintercept = 2022) +
scale_color_npg() +
xlim(c(1980, 2031)) +
theme_bw() +
theme(legend.position = "top") +
facet_grid(rows = vars(location)) +
labs(x = "Year", y = "Average maximum temperature in Spring (°C)")
p2 <- model_data %>%
select(location, year, tmax_avg_Winter, tmax_avg_Spring,
tmin_avg_Winter, tmin_avg_Spring, prcp_avg_Winter,
prcp_avg_Spring, afdd, bloom_doy) %>%
bind_rows(forecast_weather_all) %>%
ggplot(aes(x = year, y = tmin_avg_Winter, color = location)) +
geom_point() +
geom_step(linetype = 'dotted', color = 'gray50') +
geom_vline(xintercept = 2022) +
scale_color_npg() +
xlim(c(1980, 2031)) +
theme_bw() + theme(legend.position = "top") +
facet_grid(rows = vars(location)) +
labs(x = "Year", y = "Accumulated freezing degree days")
ggpubr::ggarrange(p1, p2, common.legend = T)
p1 <- model_data %>%
select(location, year, tmax_avg_Winter, tmax_avg_Spring,
tmin_avg_Winter, tmin_avg_Spring, prcp_avg_Winter,
prcp_avg_Spring, afdd, bloom_doy) %>%
bind_rows(forecast_weather_all) %>%
ggplot(aes(x = year, y = prcp_avg_Spring, color = location)) +
geom_point() +
geom_step(linetype = 'dotted', color = 'gray50') +
geom_vline(xintercept = 2022) +
scale_color_npg() +
xlim(c(1980, 2031)) +
theme_bw() +
theme(legend.position = "top") +
facet_grid(rows = vars(location)) +
labs(x = "Year", y = "Average maximum temperature in Spring (°C)")
p2 <- model_data %>%
select(location, year, tmax_avg_Winter, tmax_avg_Spring,
tmin_avg_Winter, tmin_avg_Spring, prcp_avg_Winter,
prcp_avg_Spring, afdd, bloom_doy) %>%
bind_rows(forecast_weather_all) %>%
ggplot(aes(x = year, y = prcp_avg_Winter, color = location)) +
geom_point() +
geom_step(linetype = 'dotted', color = 'gray50') +
geom_vline(xintercept = 2022) +
scale_color_npg() +
xlim(c(1980, 2031)) +
theme_bw() + theme(legend.position = "top") +
facet_grid(rows = vars(location)) +
labs(x = "Year", y = "Accumulated freezing degree days")
ggpubr::ggarrange(p1, p2, common.legend = T)
model_data %>%
select(location, year, tmax_avg_Winter, tmax_avg_Spring,
tmin_avg_Winter, tmin_avg_Spring, prcp_avg_Winter,
prcp_avg_Spring, afdd, bloom_doy) %>%
bind_rows(forecast_weather_all) %>%
ggplot(aes(x = year, y = bloom_doy, color = location)) +
geom_point() +
geom_step(linetype = 'dotted', color = 'gray50') +
geom_vline(xintercept = 2022) +
scale_color_npg() +
theme_bw() + theme(legend.position = "top") +
facet_grid(rows = vars(location)) +
labs(x = "Year", y = "Peak bloom and predicted bloom")